
from langchain.document_loaders import UnstructuredWordDocumentLoader

import sys
sys.path.append('/workspace/qanything_local')
from qanything_kernel.utils.splitter import ChineseTextSplitter
from html2text import html2text



# --- Disabled experiment: load a DOCX with UnstructuredWordDocumentLoader and
# --- flatten HTML table elements via html2text into one document. Kept for reference.
# file_path = '/workspace/qanything_local/qanything_kernel/qanything_server/test/灵活就业人员（含原自由职业者）缴费基数、比例汇总表.docx'
# loader = UnstructuredWordDocumentLoader(file_path, mode="elements")
# texts_splitter = ChineseTextSplitter(pdf=False, sentence_size=200)
# # docs = loader.load_and_split(texts_splitter)
# docs = loader.load()
# print()

# content = ''
# for doc in docs:
#     if 'text_as_html' in doc.metadata.keys():
#         text = html2text(doc.metadata['text_as_html'])
#         content += text + '\n'
#     else:
#         content += doc.page_content + '\n'

# docs[0].page_content = content
# new_docs = [docs[0]]
# print()

# # from html2text import html2text

# # new_docs = html2text(docs)
# # print()




from qanything_kernel.utils.loader.csv_loader import CSVLoader

import os


# Demo: load a Q&A CSV with the project's CSVLoader, keeping the `keywords`
# column as per-document metadata instead of page content.
file_path = r'/workspace/qanything_local/QANY_LOCAL_DB/2024-03-26/“1+1+1”家庭医生签约相关问答.csv'

# Fail fast with a clear message instead of an opaque error from inside the loader.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"CSV file not found: {file_path}")

# `metadata_columns=['keywords']` routes the 'keywords' column into each
# Document's metadata (without it, every column lands in page_content);
# delimiter/quotechar are the standard CSV defaults, stated explicitly.
loader = CSVLoader(file_path, metadata_columns=['keywords'], csv_args={"delimiter": ",", "quotechar": '"'})
docs = loader.load()
print(docs)